/*
	Estimates the degree of selection bias into the Glassdoor dataset for
	U.S. college graduates employed in the U.S.
	Output:	Figure 1
			Selection_UnitedStates.csv
*/
 
local seed "C:\Users\jsock\Dropbox\Research\GD\International"

local dataPath "`seed'/Data"
local inputPath "`seed'/InputData"
local figurePath "`seed'/Replication/Figures"
local tablePath "`seed'/Replication/Tables"
local estimatePath "`seed'/Replication/Estimates"
local tempPath "`seed'/Replication/TempData"

********************************************************
* Read in country gdp to get universitycountry gdp
********************************************************

clear 
set more off 
set matsize 10000
set scheme s1mono
 
* Set path and load data 
insheet using "`dataPath'/Salaries_international_dataset_main.csv", comma
drop v1

drop if jobtitle == ""

drop metro shortname  
drop city basecurrency country_iso dateval  
drop gender 
drop sectorname iscurrentjobflag 

generate national_rank_pct = national_rank / numberuniversities

*--------------------------------
* Thresholds used for sample selection 
*--------------------------------

scalar country_premia_thresh = 25

scalar selection_thresh = 25

scalar school_thresh = 25

local currentCountry = "United States"
local saveCountry = "UnitedStates"

*--------------------------------
* Exclude users that leave more than 10 reviews
*--------------------------------

sort fk_userid yearofsalary 

by fk_userid : gen userReviews = _N

drop if userReviews > 10

drop userReviews

*--------------------------------
* Keep only used sample
*--------------------------------

drop if school == ""

drop if median_1year_scorecard == .

drop if employertypecode == "SELF_EMPLOYED" 

keep if countryname == "`currentCountry'"

keep if universitycountry == "`currentCountry'"

*--------------------------------
* Convert to 2019 dollars
*--------------------------------

generate base_adj = ppp_xrat if yearofsalary == 2019

egen adj_2019 = max(base_adj)

generate ppp_xrat_2019 = ppp_xrat /  adj_2019

generate logbase = log(basesalary * ppp_xrat_2019)

drop if logbase == .

*--------------------------------
* Exclude outliers in base pay
*--------------------------------

scalar scalingThresh = 10

generate realbase = basesalary * ppp_xrat
generate outside_thresh =  (realbase < (1/scalingThresh) * gdppw) | (realbase > scalingThresh * gdppw) 
drop realbase

drop if outside_thresh

*--------------------------------
* Generate indicators for valid in education analysis
*--------------------------------

generate hasDegree = degree != "UNMATCHED" & degree != "missing" & degree != ""  & degree != "HIGHSCHOOL"

generate uniDegree = degree == "BACHELORS" 

generate hasSchool = school != ""

generate valid_educ = uniDegree & hasDegree & hasSchool & universitycountry != ""

keep if valid_educ 

********************************************************
* Create valid sampels for one and two year comparisons
********************************************************

*--------------------------------
* Years since school ended
*--------------------------------

split endschool, p("-")

destring endschool1, gen(end_year)

generate years_from_graduate = yearofsalary - end_year

generate grad_0year = years_from_graduate == 0
generate grad_1year = years_from_graduate == 1
generate grad_2year = years_from_graduate == 2

generate grad_years = .
replace grad_years = 1 if grad_1year
replace grad_years = 2 if grad_2year

*--------------------------------
* Years since school ended
*--------------------------------

bys school degreescorecard grpmajor : egen avg_grad_1year = median(logbase) if grad_1year

bys school degreescorecard grpmajor : egen avg_grad_2year = median(logbase) if grad_2year

bys school degreescorecard grpmajor grad_years: egen avg_grad = median(logbase) 

bys school degreescorecard grpmajor grad_years: gen n_grad = _N

*--------------------------------
* Compare Scorecard and GD data -- Aggregate to university level
*--------------------------------

preserve

	collapse (max) avg_* median_* count_* n_grad national_rank_pct (sum) grad_1year grad_2year, by(universitycountry school degreescorecard grpmajor grad_years)
	
	* Scorecard log average pay
	generate log_scorecard_1year = log(median_1year_scorecard)
	generate log_scorecard_2year = log(median_2year_scorecard)
	
	* Difference between scorecard and gd - GRADUATE YEAR
	generate diff_grad_1year = avg_grad_1year - log_scorecard_1year	
	generate diff_grad_2year = avg_grad_2year - log_scorecard_2year

	generate grad_diff = diff_grad_1year if grad_years == 1
	replace grad_diff = diff_grad_2year if grad_years == 2
	
	generate grad_n = n_grad if grad_years == 1
	replace grad_n = n_grad if grad_years == 2
	
	generate grad_count = count_1year_scorecard if grad_years == 1
	replace grad_count = count_2year_scorecard if grad_years == 2
	
	bys school degreescorecard : egen n_gd = sum(grad_n)
	bys school degreescorecard : egen n_scorecard = sum(grad_count)
	
	keep if grad_diff != .
	
	keep if degreescorecard == "BACHELORS"
	
	collapse (mean) grad_diff (max) n_gd n_scorecard  national_rank_pct [aw=grad_n], by(universitycountry school degreescorecard )
		
	* Get averages
	sum grad_diff
	local avg_equal = r(mean)
	scalar est_wtd_equal = r(mean)
	
	sum grad_diff [aw=n_gd]
	local avg_wtd_gd = r(mean)
	scalar est_wtd_gd = r(mean)
	scalar sum_wt_wtd_gd = r(sum_w)

		* Number of universities
		tab school if grad_diff != .
		scalar n_wt_wtd_gd = r(r)

	* Get averages for top 5%
	sum grad_diff if national_rank_pct <= 0.05 [aw=n_gd]
	local avg_wtd_gd_top5 = r(mean)
	scalar est_wtd_gd_top5 = r(mean)
	scalar sum_wt_wtd_gd_top5 = r(sum_w)
	
		* Number of universities
		tab school if grad_diff != . & national_rank_pct <= 0.05 
		scalar n_wt_wtd_gd_top5 = r(r)
		
	* Get averages for not top 5%
	sum grad_diff if national_rank_pct > 0.05 | national_rank_pct == . [aw=n_gd]
	local avg_wtd_gd_not5 = r(mean)
	scalar est_wtd_gd_not5 = r(mean)
	scalar sum_wt_wtd_gd_not5 = r(sum_w)
	
		* Number of universities
		tab school if grad_diff != . & national_rank_pct <= 0.05 
		scalar n_wt_wtd_gd_not5 = r(r)
	
	* Weighted and unweighted kernel density
	twoway kdensity grad_diff [aw=n_gd] if inrange(grad_diff,-0.75,0.75), lcolor(gs3) lpattern(dash) xscale(r(-0.75(0.25)0.75)) xlabel(-0.75(0.25)0.75)  ///
		xline(0, lcolor(black) lpattern(solid) lwidth(thin)) fcolor(none) ///
		xtitle("log difference within college") ytitle("density") title("") scale(1.2)
	graph export "`figurePath'/Figure_1.eps"	, replace

	sum grad_diff [aw=n_gd] , detail
	
restore

*--------------------------------
* Export summary of selection estimates
*--------------------------------

preserve

	generate selection_est_equal = est_wtd_equal
	generate selection_est_wtd = est_wtd_gd
	generate selection_est_wtd_top5 = est_wtd_gd_top5
	generate selection_est_wtd_not5 = est_wtd_gd_not5

	generate selection_wtd_N = n_wt_wtd_gd
	generate selection_wtd_N_top5 = n_wt_wtd_gd_top5
	generate selection_wtd_N_not5 = n_wt_wtd_gd_not5

	generate selection_wtd_sum = sum_wt_wtd_gd
	generate selection_wtd_sum_top5 = sum_wt_wtd_gd_top5
	generate selection_wtd_sum_not5 = sum_wt_wtd_gd_not5

	keep universitycountry selection_*
	
	keep if _n == 1
	
	order universitycountry selection_wtd_N* selection_wtd_sum* selection_est_* 
	
	outsheet using "`estimatePath'\Selection_`saveCountry'.csv" , comma replace

restore
